Set-up

Built with R 4.0.3
Last saved on 03 February 2022 at 18:16

Performing the PCA

TEC data import

TxBcounts <- readRDS(here("FullMDA", "TxBcounts3.rds"))
colnames(TxBcounts)
##  [1] "Filename" "Country"  "Series"   "Level"    "Register" "Words"   
##  [7] "ACT"      "AMP"      "ASPECT"   "AWL"      "BEMA"     "CAUSE"   
## [13] "CC"       "COMM"     "COND"     "CONT"     "CUZ"      "DEMO"    
## [19] "DMA"      "DOAUX"    "DT"       "EMPH"     "EX"       "EXIST"   
## [25] "FPP1P"    "FPP1S"    "FPUH"     "FREQ"     "HDG"      "IN"      
## [31] "JJAT"     "JJPR"     "LD"       "MDCA"     "MDCO"     "MDNE"    
## [37] "MDWO"     "MDWS"     "MENTAL"   "NCOMP"    "NN"       "OCCUR"   
## [43] "PASS"     "PEAS"     "PIT"      "PLACE"    "POLITE"   "POS"     
## [49] "PROG"     "QUAN"     "QUPR"     "RB"       "RP"       "SPLIT"   
## [55] "SPP2"     "STPR"     "THATD"    "THRC"     "THSC"     "TIME"    
## [61] "TPP3P"    "TPP3S"    "TTR"      "VBD"      "VBG"      "VBN"     
## [67] "VIMP"     "VPRT"     "WHQU"     "WHSC"     "XX0"      "YNQU"
nrow(TxBcounts)
## [1] 1961
TxBzlogcounts <- readRDS(here("FullMDA", "TxBzlogcounts.rds")) 
nrow(TxBzlogcounts)
## [1] 1961
colnames(TxBzlogcounts)
##  [1] "ACT"    "AMP"    "ASPECT" "AWL"    "BEMA"   "CAUSE"  "CC"     "COMM"  
##  [9] "COND"   "CONT"   "CUZ"    "DEMO"   "DMA"    "DOAUX"  "DT"     "EMPH"  
## [17] "EX"     "EXIST"  "FPP1P"  "FPP1S"  "FPUH"   "FREQ"   "HDG"    "IN"    
## [25] "JJAT"   "JJPR"   "LD"     "MDCA"   "MDCO"   "MDNE"   "MDWO"   "MDWS"  
## [33] "MENTAL" "NCOMP"  "NN"     "OCCUR"  "PASS"   "PEAS"   "PIT"    "PLACE" 
## [41] "POLITE" "POS"    "PROG"   "QUAN"   "QUPR"   "RB"     "RP"     "SPLIT" 
## [49] "SPP2"   "STPR"   "THATD"  "THRC"   "THSC"   "TIME"   "TPP3P"  "TPP3S" 
## [57] "TTR"    "VBD"    "VBG"    "VBN"    "VIMP"   "VPRT"   "WHQU"   "WHSC"  
## [65] "XX0"    "YNQU"
TxBdata <- cbind(TxBcounts[,1:6], as.data.frame(TxBzlogcounts))
str(TxBdata)
## 'data.frame':    1961 obs. of  72 variables:
##  $ Filename: chr  "Achievers_A1_Instructional_0012.txt" "Solutions_Pre-Intermediate_Instructional_0023.txt" "Achievers_A2_Personal_0003.txt" "Achievers_B1_plus_Informative_0007.txt" ...
##  $ Country : Factor w/ 3 levels "France","Germany",..: 3 3 3 3 1 2 3 2 2 2 ...
##  $ Series  : Factor w/ 9 levels "Access","Achievers",..: 2 9 2 2 8 1 2 7 1 7 ...
##  $ Level   : Factor w/ 5 levels "A","B","C","D",..: 1 2 2 4 2 4 4 1 1 2 ...
##  $ Register: Factor w/ 5 levels "Conversation",..: 4 4 5 3 1 2 4 1 2 2 ...
##  $ Words   : int  931 889 979 690 694 547 967 927 840 1127 ...
##  $ ACT     : num  -0.231 0.933 0.597 1.262 -0.851 ...
##  $ AMP     : num  -0.658 -0.401 -0.427 -0.313 0.174 ...
##  $ ASPECT  : num  0.202 1.02 -0.527 0.713 -0.473 ...
##  $ AWL     : num  0.733 0.575 -0.638 0.933 -0.928 ...
##  $ BEMA    : num  -0.6836 -0.7093 -0.0337 -0.2838 0.9802 ...
##  $ CAUSE   : num  -0.69023 -0.39356 0.2673 0.00833 -0.35629 ...
##  $ CC      : num  -0.498 0.129 -0.125 1.536 -0.624 ...
##  $ COMM    : num  1.036 0.782 -0.551 0.239 -0.82 ...
##  $ COND    : num  -0.1011 -0.6147 0.0464 -0.6147 0.9133 ...
##  $ CONT    : num  -0.612 -0.753 0.768 -0.595 0.725 ...
##  $ CUZ     : num  -0.5311 -0.5311 -0.0409 -0.5311 -0.5311 ...
##  $ DEMO    : num  -0.371 -0.332 -0.669 -0.835 0.639 ...
##  $ DMA     : num  -0.446 -0.529 -0.365 -0.529 1.106 ...
##  $ DOAUX   : num  0.534 0.288 1.312 -0.916 0.598 ...
##  $ DT      : num  0.0449 0.7521 -0.8778 -0.4304 0.6603 ...
##  $ EMPH    : num  -0.811 -0.688 0.34 -0.456 -0.222 ...
##  $ EX      : num  -0.467 -0.655 0.239 -0.655 0.491 ...
##  $ EXIST   : num  -0.545 -0.139 -0.251 1.643 0.361 ...
##  $ FPP1P   : num  -0.579 -0.579 1.197 1.125 0.971 ...
##  $ FPP1S   : num  -0.6252 -0.616 0.9428 -0.6465 -0.0813 ...
##  $ FPUH    : num  -0.47 -0.33 -0.344 -0.47 0.822 ...
##  $ FREQ    : num  0.1594 -0.0277 -0.6317 -0.3651 -0.5756 ...
##  $ HDG     : num  0.0544 -0.5244 -0.5244 0.2747 0.2704 ...
##  $ IN      : num  0.314 1.193 -0.552 0.49 -0.601 ...
##  $ JJAT    : num  -0.786 -0.715 -0.833 1.145 -0.147 ...
##  $ JJPR    : num  -0.651 -0.627 -0.111 -0.24 0.409 ...
##  $ LD      : num  0.997 -0.167 0.183 0.949 -0.732 ...
##  $ MDCA    : num  -0.56379 -0.24224 -0.14574 0.00326 -0.41384 ...
##  $ MDCO    : num  -0.541 -0.541 -0.541 -0.541 -0.541 ...
##  $ MDNE    : num  -0.632 -0.632 0.991 0.983 1.099 ...
##  $ MDWO    : num  -0.327 0.182 0.367 -0.532 -0.532 ...
##  $ MDWS    : num  -0.1848 -0.5195 -0.0988 1.2966 -0.2656 ...
##  $ MENTAL  : num  0.304 -0.103 0.558 -0.148 -0.226 ...
##  $ NCOMP   : num  0.423 -0.647 1.553 1.194 -0.467 ...
##  $ NN      : num  0.384 0.553 -0.413 0.889 -0.868 ...
##  $ OCCUR   : num  -0.488 0.319 -0.66 -0.66 -0.66 ...
##  $ PASS    : num  -0.475 -0.257 -0.313 0.157 -0.412 ...
##  $ PEAS    : num  -0.621 -0.621 -0.488 0.19 -0.621 ...
##  $ PIT     : num  -0.812 -0.918 0.382 0.469 1.239 ...
##  $ PLACE   : num  -0.588 -0.314 0.317 0.652 0.59 ...
##  $ POLITE  : num  -0.415 -0.415 0.399 -0.415 -0.041 ...
##  $ POS     : num  0.17589 -0.83652 -0.83652 -0.49089 0.00188 ...
##  $ PROG    : num  0.442 -0.0613 0.7701 0.6251 1.3626 ...
##  $ QUAN    : num  -0.7083 -0.0067 0.2901 -0.5901 -0.2334 ...
##  $ QUPR    : num  -0.5186 -0.6999 -0.4765 0.6157 0.0386 ...
##  $ RB      : num  -0.541 -0.974 0.32 -0.769 1.297 ...
##  $ RP      : num  -0.792 0.544 -0.631 0.723 -0.792 ...
##  $ SPLIT   : num  -0.637 -0.637 -0.389 0.71 0.179 ...
##  $ SPP2    : num  0.144 0.442 0.44 0.586 0.192 ...
##  $ STPR    : num  -0.223 0.975 0.82 -0.604 -0.604 ...
##  $ THATD   : num  1.018 -0.716 0.639 -0.716 -0.716 ...
##  $ THRC    : num  -0.523 -0.523 -0.523 -0.523 -0.523 ...
##  $ THSC    : num  -0.302 -0.113 -0.451 -0.128 -0.384 ...
##  $ TIME    : num  -0.3472 -0.5245 0.0688 -0.5329 0.1273 ...
##  $ TPP3P   : num  -0.259 -0.445 -0.662 0.537 -0.617 ...
##  $ TPP3S   : num  -0.47 -0.28 -0.594 -0.631 -0.365 ...
##  $ TTR     : num  -0.6587 -0.2769 -0.0125 1.2226 -0.3719 ...
##  $ VBD     : num  -0.669 -0.468 -0.621 -0.176 -0.594 ...
##  $ VBG     : num  -0.673 0.627 0.644 0.957 -0.741 ...
##  $ VBN     : num  -0.608 0.329 -0.608 0.568 -0.608 ...
##  $ VIMP    : num  1.086 0.978 -0.432 -0.434 -0.409 ...
##  $ VPRT    : num  -0.491 -0.586 0.893 0.295 0.934 ...
##  $ WHQU    : num  0.9144 0.604 0.4301 -0.7314 0.0563 ...
##  $ WHSC    : num  -0.454 -0.322 -0.613 0.851 -0.381 ...
##  $ XX0     : num  -0.716 -0.51 0.21 -0.52 0.681 ...
##  $ YNQU    : num  -0.0974 0.7753 0.0723 -0.7135 1.5438 ...
#saveRDS(TxBdata, here("FullMDA", "TxBdata.rds")) # Last saved 6 December 2021

Checking the factorability of data

colnames(TxBdata)
##  [1] "Filename" "Country"  "Series"   "Level"    "Register" "Words"   
##  [7] "ACT"      "AMP"      "ASPECT"   "AWL"      "BEMA"     "CAUSE"   
## [13] "CC"       "COMM"     "COND"     "CONT"     "CUZ"      "DEMO"    
## [19] "DMA"      "DOAUX"    "DT"       "EMPH"     "EX"       "EXIST"   
## [25] "FPP1P"    "FPP1S"    "FPUH"     "FREQ"     "HDG"      "IN"      
## [31] "JJAT"     "JJPR"     "LD"       "MDCA"     "MDCO"     "MDNE"    
## [37] "MDWO"     "MDWS"     "MENTAL"   "NCOMP"    "NN"       "OCCUR"   
## [43] "PASS"     "PEAS"     "PIT"      "PLACE"    "POLITE"   "POS"     
## [49] "PROG"     "QUAN"     "QUPR"     "RB"       "RP"       "SPLIT"   
## [55] "SPP2"     "STPR"     "THATD"    "THRC"     "THSC"     "TIME"    
## [61] "TPP3P"    "TPP3S"    "TTR"      "VBD"      "VBG"      "VBN"     
## [67] "VIMP"     "VPRT"     "WHQU"     "WHSC"     "XX0"      "YNQU"
ncol(TxBdata) - 6
## [1] 66
kmo <- KMO(TxBdata[,7:ncol(TxBdata)]) 
kmo # 0.86 (if conducted on counts all normalised per 100 words, this value drops to 0.61!)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = TxBdata[, 7:ncol(TxBdata)])
## Overall MSA =  0.86
## MSA for each item = 
##    ACT    AMP ASPECT    AWL   BEMA  CAUSE     CC   COMM   COND   CONT    CUZ 
##   0.65   0.95   0.90   0.92   0.93   0.69   0.90   0.87   0.75   0.92   0.95 
##   DEMO    DMA  DOAUX     DT   EMPH     EX  EXIST  FPP1P  FPP1S   FPUH   FREQ 
##   0.89   0.96   0.84   0.80   0.96   0.90   0.86   0.92   0.87   0.95   0.65 
##    HDG     IN   JJAT   JJPR     LD   MDCA   MDCO   MDNE   MDWO   MDWS MENTAL 
##   0.96   0.88   0.89   0.94   0.68   0.53   0.77   0.52   0.34   0.46   0.84 
##  NCOMP     NN  OCCUR   PASS   PEAS    PIT  PLACE POLITE    POS   PROG   QUAN 
##   0.79   0.88   0.90   0.94   0.91   0.96   0.97   0.96   0.64   0.92   0.92 
##   QUPR     RB     RP  SPLIT   SPP2   STPR  THATD   THRC   THSC   TIME  TPP3P 
##   0.95   0.95   0.81   0.93   0.83   0.81   0.86   0.89   0.95   0.93   0.80 
##  TPP3S    TTR    VBD    VBG    VBN   VIMP   VPRT   WHQU   WHSC    XX0   YNQU 
##   0.66   0.91   0.59   0.86   0.95   0.78   0.60   0.89   0.85   0.92   0.91
kmo$MSAi[order(kmo$MSAi)]
##      MDWO      MDWS      MDNE      MDCA       VBD      VPRT       POS       ACT 
## 0.3419233 0.4607994 0.5174560 0.5323388 0.5947193 0.6037051 0.6435553 0.6517842 
##      FREQ     TPP3S        LD     CAUSE      COND      MDCO      VIMP     NCOMP 
## 0.6518112 0.6595372 0.6831356 0.6886684 0.7472644 0.7681180 0.7782319 0.7914283 
##        DT     TPP3P      STPR        RP      SPP2    MENTAL     DOAUX      WHSC 
## 0.7989497 0.8040188 0.8106413 0.8137266 0.8288755 0.8388478 0.8400305 0.8545428 
##       VBG     EXIST     THATD      COMM     FPP1S        IN        NN      WHQU 
## 0.8566262 0.8621924 0.8625047 0.8685300 0.8726754 0.8819697 0.8845778 0.8866317 
##      JJAT      DEMO      THRC    ASPECT        CC        EX     OCCUR      PEAS 
## 0.8888528 0.8899441 0.8927221 0.8950231 0.8980297 0.9028360 0.9030450 0.9069610 
##       TTR      YNQU       AWL      QUAN     FPP1P      PROG       XX0      CONT 
## 0.9095693 0.9102920 0.9179605 0.9183916 0.9210382 0.9218841 0.9228772 0.9237714 
##      TIME      BEMA     SPLIT      PASS      JJPR       AMP      QUPR      THSC 
## 0.9280529 0.9298954 0.9340567 0.9364387 0.9376667 0.9495248 0.9502215 0.9505850 
##        RB      FPUH       CUZ       VBN       PIT       DMA    POLITE      EMPH 
## 0.9510642 0.9525589 0.9546196 0.9547362 0.9557132 0.9564074 0.9587834 0.9597464 
##       HDG     PLACE 
## 0.9627588 0.9655892
# Remove first feature with MSAs of < 0.5
TxBdata <- TxBdata %>% 
  select(-c(MDWO))

kmo <- KMO(TxBdata[,7:ncol(TxBdata)]) 
kmo # 0.87
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = TxBdata[, 7:ncol(TxBdata)])
## Overall MSA =  0.87
## MSA for each item = 
##    ACT    AMP ASPECT    AWL   BEMA  CAUSE     CC   COMM   COND   CONT    CUZ 
##   0.66   0.95   0.89   0.92   0.93   0.70   0.91   0.87   0.80   0.93   0.96 
##   DEMO    DMA  DOAUX     DT   EMPH     EX  EXIST  FPP1P  FPP1S   FPUH   FREQ 
##   0.89   0.96   0.84   0.80   0.96   0.90   0.86   0.92   0.87   0.95   0.65 
##    HDG     IN   JJAT   JJPR     LD   MDCA   MDCO   MDNE   MDWS MENTAL  NCOMP 
##   0.96   0.88   0.89   0.94   0.69   0.61   0.77   0.58   0.55   0.85   0.81 
##     NN  OCCUR   PASS   PEAS    PIT  PLACE POLITE    POS   PROG   QUAN   QUPR 
##   0.89   0.91   0.94   0.90   0.96   0.97   0.96   0.64   0.93   0.92   0.95 
##     RB     RP  SPLIT   SPP2   STPR  THATD   THRC   THSC   TIME  TPP3P  TPP3S 
##   0.95   0.81   0.93   0.83   0.82   0.87   0.89   0.95   0.92   0.80   0.66 
##    TTR    VBD    VBG    VBN   VIMP   VPRT   WHQU   WHSC    XX0   YNQU 
##   0.91   0.63   0.85   0.96   0.81   0.65   0.89   0.85   0.92   0.91
kmo$MSAi[order(kmo$MSAi)] # All individual MSA > 0.5
##      MDWS      MDNE      MDCA       VBD       POS      VPRT      FREQ       ACT 
## 0.5465989 0.5809599 0.6146009 0.6323342 0.6414034 0.6451452 0.6501807 0.6573716 
##     TPP3S        LD     CAUSE      MDCO      COND        DT     TPP3P      VIMP 
## 0.6624572 0.6852487 0.6976734 0.7674031 0.7956649 0.7987725 0.8040846 0.8076928 
##     NCOMP        RP      STPR      SPP2     DOAUX    MENTAL       VBG      WHSC 
## 0.8091170 0.8116498 0.8191502 0.8267635 0.8362975 0.8468678 0.8537894 0.8544238 
##     EXIST     THATD     FPP1S      COMM        IN        NN      DEMO      WHQU 
## 0.8622285 0.8662124 0.8722478 0.8722598 0.8816688 0.8869328 0.8886727 0.8897047 
##      THRC      JJAT    ASPECT      PEAS        EX     OCCUR        CC       TTR 
## 0.8915424 0.8933355 0.8935492 0.9002544 0.9018388 0.9059682 0.9105868 0.9139092 
##      YNQU       AWL      QUAN     FPP1P      TIME       XX0      CONT      PROG 
## 0.9143984 0.9173308 0.9176126 0.9199363 0.9221310 0.9241552 0.9277727 0.9293376 
##      BEMA     SPLIT      PASS      JJPR      THSC       AMP        RB      QUPR 
## 0.9296225 0.9339730 0.9360995 0.9387083 0.9470548 0.9476720 0.9502436 0.9509730 
##      FPUH       PIT       VBN       DMA       CUZ    POLITE      EMPH       HDG 
## 0.9530703 0.9550031 0.9552492 0.9567070 0.9581991 0.9594241 0.9608348 0.9619206 
##     PLACE 
## 0.9657210

Choosing the number of principal components to retain and excluding features with low final communalites

https://rdrr.io/cran/FactorAssumptions/man/communalities_optimal_solution.html

#TxBdata <- readRDS(here("FullMDA", "TxBdata.rds")) %>% select(-MDWO)
#colnames(TxBdata)

# Plot screen plot
#png(here("Plots", "screeplot-TEC-only.png"), width = 20, height= 12, units = "cm", res = 300)
scree(TxBdata[,7:ncol(TxBdata)], factors = FALSE, pc = TRUE) # Retain six components

dev.off()
## null device 
##           1
# Perform PCA
pca1 <- psych::principal(TxBdata[,7:ncol(TxBdata)], 
                         nfactors = 6)
pca1$loadings
## 
## Loadings:
##        RC1    RC2    RC4    RC6    RC3    RC5   
## ACT    -0.387  0.260  0.111  0.105  0.147       
## AMP     0.405  0.368                       0.148
## ASPECT -0.432                      -0.222  0.303
## AWL    -0.662  0.402        -0.458         0.147
## BEMA    0.832  0.148                       0.102
## CAUSE   0.111         0.108         0.386 -0.135
## CC     -0.202  0.653 -0.276         0.172  0.106
## COMM   -0.665 -0.353        -0.151        -0.165
## COND                  0.499         0.129       
## CONT    0.849 -0.183  0.236  0.185              
## CUZ     0.256  0.422                       0.149
## DEMO    0.369 -0.163  0.221  0.192  0.149       
## DMA     0.782 -0.324  0.138                     
## DOAUX   0.121 -0.566  0.175 -0.212         0.261
## DT     -0.571 -0.181         0.317 -0.235  0.321
## EMPH    0.581  0.111  0.376  0.129         0.113
## EX      0.355  0.231 -0.263  0.325  0.101  0.247
## EXIST          0.499        -0.157         0.101
## FPP1P   0.598                0.200  0.123       
## FPP1S   0.724 -0.142  0.274  0.185 -0.137       
## FPUH    0.748 -0.348                      -0.138
## FREQ           0.127                       0.433
## HDG     0.276         0.177  0.219         0.117
## IN     -0.677  0.412                       0.187
## JJAT           0.515  0.234  0.134         0.382
## JJPR    0.593  0.276  0.125                0.204
## LD     -0.319  0.117 -0.244 -0.695  0.118       
## MDCA    0.197 -0.182         0.177  0.603       
## MDCO           0.217  0.338  0.178 -0.321 -0.115
## MDNE    0.142         0.345         0.123       
## MDWS    0.178         0.462         0.123 -0.117
## MENTAL -0.449 -0.423  0.289 -0.241  0.134  0.149
## NCOMP          0.249        -0.299  0.385 -0.112
## NN     -0.530  0.284 -0.517 -0.408  0.225 -0.148
## OCCUR  -0.136  0.503               -0.201       
## PASS           0.715  0.121 -0.174              
## PEAS           0.482  0.445        -0.190       
## PIT     0.599  0.201  0.133  0.167              
## PLACE   0.544  0.110         0.316        -0.104
## POLITE  0.668 -0.323                0.104 -0.131
## POS                                       -0.468
## PROG    0.299         0.263  0.229 -0.113 -0.174
## QUAN    0.359         0.321  0.412         0.377
## QUPR    0.188  0.137  0.387  0.277              
## RB      0.441  0.187  0.315  0.425 -0.269       
## RP     -0.178  0.207  0.214  0.281        -0.377
## SPLIT   0.243  0.560  0.369                     
## SPP2   -0.255 -0.590  0.239 -0.170  0.465       
## STPR   -0.103               -0.268              
## THATD   0.131 -0.131  0.501 -0.117 -0.135  0.212
## THRC   -0.128  0.348  0.192         0.162  0.123
## THSC           0.493  0.297  0.141         0.161
## TIME    0.359  0.268  0.109  0.193        -0.262
## TPP3P          0.531 -0.103  0.138  0.112       
## TPP3S   0.128  0.263         0.147 -0.571 -0.352
## TTR            0.750  0.153        -0.122 -0.189
## VBD     0.118  0.452         0.266 -0.691 -0.183
## VBG    -0.225  0.512  0.279 -0.195         0.164
## VBN            0.586        -0.323 -0.142       
## VIMP   -0.728 -0.511 -0.106 -0.252  0.174       
## VPRT    0.627                       0.533  0.135
## WHQU   -0.221 -0.674        -0.333  0.139  0.106
## WHSC   -0.511  0.288  0.219         0.239 -0.232
## XX0     0.674         0.282  0.208              
## YNQU    0.246 -0.650        -0.249         0.151
## 
##                   RC1   RC2   RC4   RC6   RC3   RC5
## SS loadings    11.240 8.383 3.370 3.099 2.871 2.108
## Proportion Var  0.173 0.129 0.052 0.048 0.044 0.032
## Cumulative Var  0.173 0.302 0.354 0.401 0.446 0.478
pca1$communality %>% sort(.) # If features with communalities of < 0.2 are removed, we remove STPR, HDG, MDNE and CAUSE
##       STPR       MDNE        HDG      CAUSE       FREQ       THRC        POS 
## 0.08680591 0.16599754 0.17031066 0.19385220 0.21559975 0.21665202 0.24146645 
##       PROG        ACT       DEMO       MDWS        CUZ       COND       QUPR 
## 0.25772748 0.26338996 0.27063897 0.27686168 0.27687863 0.27739424 0.28622322 
##      EXIST       MDCO      NCOMP      OCCUR       TIME     ASPECT      TPP3P 
## 0.29399921 0.30928840 0.31480148 0.31708189 0.31918395 0.33002812 0.33354566 
##        AMP         RP      THATD       THSC         EX      FPP1P      PLACE 
## 0.33518074 0.34193421 0.36279318 0.38651020 0.42556202 0.42852216 0.42992831 
##        PIT        VBG       PEAS       MDCA      DOAUX        VBN       JJPR 
## 0.45207315 0.45554303 0.47436604 0.47786754 0.47949968 0.48281382 0.48762746 
##       JJAT       WHSC      SPLIT       EMPH       QUAN     MENTAL      TPP3S 
## 0.48782476 0.50334350 0.51457713 0.52622378 0.55407414 0.56217178 0.56414393 
##       PASS       YNQU     POLITE         RB         CC        XX0         DT 
## 0.56500214 0.57919758 0.58209654 0.58283536 0.58388767 0.58932610 0.61919662 
##       COMM       WHQU        TTR      FPP1S         IN         LD       FPUH 
## 0.62403733 0.64487002 0.64917506 0.67309250 0.67535202 0.67672838 0.70390909 
##       VPRT       SPP2       BEMA        DMA        VBD        AWL       CONT 
## 0.70823740 0.71930226 0.73708708 0.74191227 0.79994657 0.84672034 0.84847556 
##         NN       VIMP 
## 0.86821643 0.90139093
TxBdataforPCA <- TxBdata %>% 
  select(-c(STPR, MDNE, HDG, CAUSE))

kmo <- KMO(TxBdataforPCA[,7:ncol(TxBdataforPCA)]) 
kmo # 0.88
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = TxBdataforPCA[, 7:ncol(TxBdataforPCA)])
## Overall MSA =  0.88
## MSA for each item = 
##    ACT    AMP ASPECT    AWL   BEMA     CC   COMM   COND   CONT    CUZ   DEMO 
##   0.67   0.95   0.89   0.92   0.93   0.91   0.88   0.78   0.93   0.96   0.89 
##    DMA  DOAUX     DT   EMPH     EX  EXIST  FPP1P  FPP1S   FPUH   FREQ     IN 
##   0.96   0.84   0.79   0.96   0.90   0.86   0.92   0.87   0.95   0.65   0.89 
##   JJAT   JJPR     LD   MDCA   MDCO   MDWS MENTAL  NCOMP     NN  OCCUR   PASS 
##   0.89   0.94   0.69   0.64   0.79   0.54   0.85   0.82   0.89   0.90   0.94 
##   PEAS    PIT  PLACE POLITE    POS   PROG   QUAN   QUPR     RB     RP  SPLIT 
##   0.91   0.95   0.97   0.96   0.64   0.93   0.91   0.95   0.95   0.82   0.93 
##   SPP2  THATD   THRC   THSC   TIME  TPP3P  TPP3S    TTR    VBD    VBG    VBN 
##   0.82   0.86   0.90   0.95   0.92   0.81   0.66   0.92   0.65   0.86   0.95 
##   VIMP   VPRT   WHQU   WHSC    XX0   YNQU 
##   0.82   0.66   0.89   0.86   0.92   0.91
kmo$MSAi[order(kmo$MSAi)] # All individual MSA > 0.5
##      MDWS      MDCA       POS      FREQ       VBD     TPP3S      VPRT       ACT 
## 0.5366469 0.6420343 0.6429012 0.6462548 0.6533315 0.6642081 0.6647081 0.6651354 
##        LD      COND        DT      MDCO     TPP3P        RP     NCOMP      VIMP 
## 0.6862060 0.7828310 0.7921304 0.7928956 0.8110997 0.8197204 0.8210077 0.8240245 
##      SPP2     DOAUX    MENTAL      WHSC       VBG     THATD     EXIST     FPP1S 
## 0.8241715 0.8366812 0.8483976 0.8550631 0.8566951 0.8635491 0.8638716 0.8709469 
##      COMM        NN        IN      WHQU      DEMO    ASPECT      JJAT      THRC 
## 0.8781870 0.8859333 0.8881073 0.8890239 0.8904559 0.8930911 0.8947296 0.8985048 
##        EX     OCCUR      PEAS        CC      YNQU      QUAN       AWL      TIME 
## 0.9002604 0.9040632 0.9060428 0.9121949 0.9133540 0.9145440 0.9189537 0.9189799 
##       XX0     FPP1P       TTR      CONT      PROG      BEMA     SPLIT      PASS 
## 0.9199086 0.9209230 0.9215357 0.9277423 0.9284678 0.9296929 0.9347762 0.9369669 
##      JJPR      THSC        RB      QUPR       AMP      FPUH       PIT       VBN 
## 0.9383203 0.9472544 0.9492791 0.9508128 0.9518372 0.9521478 0.9542214 0.9547174 
##       DMA    POLITE       CUZ      EMPH     PLACE 
## 0.9568827 0.9590518 0.9592047 0.9622991 0.9655714
# Final number of features
ncol(TxBdataforPCA)-6
## [1] 61
#saveRDS(TxBdataforPCA, here("FullMDA", "TxBdataforPCA.rds")) # Last saved on 3 December 2021

https://stats.stackexchange.com/questions/59213/how-to-compute-varimax-rotated-principal-components-in-r

Testing the effect of rotating the components

This chunk was used when considering whether or not to rotate the components (see methods section)

# Comparing a rotated vs. a non-rotated solution

TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
colnames(TxBdata)
ncol(TxBdata)-6

# No rotation
pca2 <- psych::principal(TxBdata[,7:ncol(TxBdata)], 
                         nfactors = 6, 
                         rotate = "none")

pca2$loadings

biplot.psych(pca2, 
             vars = TRUE, 
             choose=c(1,2),
             )

# Promax rotation
pca2.rotated <- psych::principal(TxBdata[,7:ncol(TxBdata)], 
                         nfactors = 6, 
                         rotate = "promax")

# This summary shows the component correlations which is particularly interesting
pca2.rotated

pca2.rotated$loadings

biplot.psych(pca2.rotated, vars = TRUE, choose=c(1,2))

Peforming the PCA

Using the full data (except outliers)

# Perform PCA on full data
TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
nrow(TxBdata)
## [1] 1961

Using subsets of the data

Perform PCA on random subset of the data to test the stability of the solution. Re-running this line will generate a new subset of the TEC texts containing 2/3 of the texts randomly sampled.

TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds")) %>% 
  slice_sample(n = 1961*0.6, replace = FALSE)
nrow(TxBdata)
TxBdata$Filename[1:10]

colnames(TxBdata)
nrow(TxBdata) / (ncol(TxBdata)-6) # Check that there is enough data to conduct a PCA. This should be > 5 (see Friginal & Hardy 2014: 303–304).

Perform PCA on country subset of the data to test the stability of the solution:

TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds")) %>% 
  #filter(Country == "France")
  #filter(Country == "Germany")
  filter(Country == "Spain")

nrow(TxBdata)
TxBdata$Filename[1:10] # Check data

colnames(TxBdata)
nrow(TxBdata) / (ncol(TxBdata)-6) # Check that there is enough data to conduct a PCA. This should be > 5 (see Friginal & Hardy 2014: 303–304).

PCA code

pca <- prcomp(TxBdata[,7:ncol(TxBdata)], scale.=FALSE) # All quantitative variables for all TxB files except outliers
register  <- factor(TxBdata[,"Register"]) # Register
level <- factor(TxBdata[,"Level"]) # Textbook proficiency level

summary(register)
##  Conversation       Fiction   Informative Instructional      Personal 
##           583           285           363           644            86
summary(level)
##   A   B   C   D   E 
## 281 469 494 468 249
summary(pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.1693 1.7776 1.08902 1.00207 0.84288 0.76792 0.70295
## Proportion of Variance 0.2108 0.1416 0.05313 0.04499 0.03183 0.02642 0.02214
## Cumulative Proportion  0.2108 0.3524 0.40553 0.45051 0.48234 0.50876 0.53090
##                            PC8     PC9    PC10    PC11    PC12   PC13    PC14
## Standard deviation     0.67987 0.66896 0.62006 0.60634 0.59078 0.5882 0.58213
## Proportion of Variance 0.02071 0.02005 0.01722 0.01647 0.01564 0.0155 0.01518
## Cumulative Proportion  0.55160 0.57165 0.58888 0.60535 0.62098 0.6365 0.65167
##                           PC15    PC16    PC17    PC18   PC19    PC20   PC21
## Standard deviation     0.56560 0.55730 0.54868 0.54180 0.5325 0.52111 0.5153
## Proportion of Variance 0.01433 0.01391 0.01349 0.01315 0.0127 0.01217 0.0119
## Cumulative Proportion  0.66600 0.67991 0.69340 0.70655 0.7192 0.73142 0.7433
##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.51058 0.50478 0.49919 0.48542 0.48162 0.47183 0.46835
## Proportion of Variance 0.01168 0.01142 0.01116 0.01056 0.01039 0.00997 0.00983
## Cumulative Proportion  0.75499 0.76641 0.77757 0.78813 0.79852 0.80849 0.81832
##                           PC29    PC30    PC31    PC32    PC33    PC34    PC35
## Standard deviation     0.46383 0.46072 0.45286 0.44463 0.44124 0.43630 0.42986
## Proportion of Variance 0.00964 0.00951 0.00919 0.00886 0.00872 0.00853 0.00828
## Cumulative Proportion  0.82796 0.83747 0.84666 0.85551 0.86423 0.87276 0.88104
##                           PC36    PC37    PC38    PC39    PC40    PC41    PC42
## Standard deviation     0.42419 0.41741 0.41442 0.40058 0.39737 0.39455 0.38291
## Proportion of Variance 0.00806 0.00781 0.00769 0.00719 0.00707 0.00697 0.00657
## Cumulative Proportion  0.88910 0.89691 0.90460 0.91179 0.91887 0.92584 0.93241
##                           PC43    PC44    PC45    PC46    PC47   PC48    PC49
## Standard deviation     0.37583 0.36825 0.35414 0.34269 0.33970 0.3308 0.32069
## Proportion of Variance 0.00633 0.00608 0.00562 0.00526 0.00517 0.0049 0.00461
## Cumulative Proportion  0.93874 0.94481 0.95043 0.95569 0.96086 0.9658 0.97037
##                          PC50    PC51    PC52    PC53    PC54    PC55   PC56
## Standard deviation     0.3098 0.30595 0.28868 0.27485 0.25385 0.24257 0.2167
## Proportion of Variance 0.0043 0.00419 0.00373 0.00338 0.00289 0.00264 0.0021
## Cumulative Proportion  0.9747 0.97887 0.98260 0.98598 0.98887 0.99151 0.9936
##                           PC57    PC58    PC59   PC60    PC61
## Standard deviation     0.21094 0.18995 0.18519 0.1496 0.07340
## Proportion of Variance 0.00199 0.00162 0.00154 0.0010 0.00024
## Cumulative Proportion  0.99560 0.99722 0.99876 0.9998 1.00000
scree(TxBdata[,7:ncol(TxBdata)], factors = FALSE, pc = TRUE) # The scree plots pretty much always suggest that six components should be retained

Plotting PCA results

3D plots

## 3-D PCA plots https://cran.r-project.org/web/packages/pca3d/vignettes/pca3d.pdf ##
# 3-D plot
colours <- palette[c(1:3,8,7)] # without poetry
names(colours) <- c("Conversation", "Fiction", "Informative", "Instructional", "Personal")
scales::show_col(colours) # Check colours
pca3d(pca, 
      group = register,
      components = 1:3,
      #components = 4:6,
      show.ellipses=FALSE, 
      ellipse.ci=0.75,
      show.plane=FALSE,
      col = col,
      shape = "sphere",
      radius = 1,
      legend = "right")

snapshotPCA3d(here("Plots", "PCA_TxB_3Dsnapshot.png"))

names(colours) <- c("C", "B", "E", "A", "D") # To colour the dots according to the profiency level of the textbooks
pca3d(pca, 
      components = 4:6,
      group = level,
      show.ellipses=FALSE, 
      ellipse.ci=0.75,
      show.plane=FALSE,
      col = colours,
      shape = "sphere",
      radius = 0.8,
      legend = "right")

Bi-plots

# PCA for PCAtools

# This package requires the data to be formatted in a rather unconventional way so it needs to wrangled first.

#TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))

TxBdata2meta <- TxBdata[,1:6]
rownames(TxBdata2meta) <- TxBdata2meta$Filename
TxBdata2meta <- TxBdata2meta %>% select(-Filename)
head(TxBdata2meta)
##                                                   Country    Series Level
## Achievers_A1_Instructional_0012.txt                 Spain Achievers     A
## Solutions_Pre-Intermediate_Instructional_0023.txt   Spain Solutions     B
## Achievers_A2_Personal_0003.txt                      Spain Achievers     B
## Achievers_B1_plus_Informative_0007.txt              Spain Achievers     D
## POC_5e_Spoken_0003.txt                             France       POC     B
## Access_4_Narrative_0013.txt                       Germany    Access     D
##                                                        Register Words
## Achievers_A1_Instructional_0012.txt               Instructional   931
## Solutions_Pre-Intermediate_Instructional_0023.txt Instructional   889
## Achievers_A2_Personal_0003.txt                         Personal   979
## Achievers_B1_plus_Informative_0007.txt              Informative   690
## POC_5e_Spoken_0003.txt                             Conversation   694
## Access_4_Narrative_0013.txt                             Fiction   547
TxBdata2 = TxBdata
rownames(TxBdata2) <- TxBdata2$Filename
TxBdata2num <- as.data.frame(base::t(TxBdata2[,7:ncol(TxBdata2)]))
TxBdata2num[1:12,1:3] # Check sanity of data
##        Achievers_A1_Instructional_0012.txt
## ACT                             -0.2310193
## AMP                             -0.6575617
## ASPECT                           0.2019095
## AWL                              0.7330794
## BEMA                            -0.6836245
## CC                              -0.4984911
## COMM                             1.0357805
## COND                            -0.1011047
## CONT                            -0.6124532
## CUZ                             -0.5310538
## DEMO                            -0.3706952
## DMA                             -0.4459697
##        Solutions_Pre-Intermediate_Instructional_0023.txt
## ACT                                            0.9328214
## AMP                                           -0.4007729
## ASPECT                                         1.0196960
## AWL                                            0.5749224
## BEMA                                          -0.7093252
## CC                                             0.1285756
## COMM                                           0.7819071
## COND                                          -0.6146586
## CONT                                          -0.7525215
## CUZ                                           -0.5310538
## DEMO                                          -0.3318521
## DMA                                           -0.5288974
##        Achievers_A2_Personal_0003.txt
## ACT                        0.59748978
## AMP                       -0.42747830
## ASPECT                    -0.52672897
## AWL                       -0.63781078
## BEMA                      -0.03367772
## CC                        -0.12522295
## COMM                      -0.55058584
## COND                       0.04641396
## CONT                       0.76769734
## CUZ                       -0.04087171
## DEMO                      -0.66887418
## DMA                       -0.36474869
p <- PCAtools::pca(TxBdata2num, 
         metadata = TxBdata2meta,
         scale = FALSE)

colkey = c(Conversation="#BD241E", Fiction="#A18A33", Informative="#15274D", Instructional="#F9B921", Personal="#722672")
shapekey = c(A=1, B=2, C=6, D=0, E=5)

# Biplots to examine components more carefully
PCAtools::biplot(p,
                 x = "PC1",
                 y = "PC2",
                 lab = NULL, # Otherwise will try to label each data point!
                 #xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
                 #ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
                 colby = "Register",
                 pointSize = 2,
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 22,
                 legendPosition = 'right',
                 legendTitleSize = 22,
                 legendLabSize = 18, 
                 legendIconSize = 7) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

#ggsave(here("Plots", "PCA_TxB_BiplotPC1_PC2.svg"), width = 12, height = 10)

# Biplots to examine components more carefully
pRegisters <- PCAtools::biplot(p,
                 x = "PC3",
                 y = "PC4",
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Register",
                 pointSize = 2,
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 legendPosition = 'right',
                 legendTitleSize = 22,
                 legendLabSize = 18, 
                 legendIconSize = 7) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#ggsave(here("Plots", "PCA_TxB_BiplotPC3_PC4.svg"), width = 12, height = 10)
# Biplot with ellipses for Level rather than Register
colkey = c(A="#F9B921", B="#A18A33", C="#BD241E", D="#722672", E="#15274D")
shapekey = c(Conversation=1, Fiction=2, Informative=6, Instructional=0, Personal=5)

pLevels <- PCAtools::biplot(p,
                 x = "PC3",
                 y = "PC4",
                 lab = NULL, # Otherwise will try to label each data point!
                 #xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
                 #ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
                 colby = "Level",
                 pointSize = 2,
                 colkey = colkey,
                 shape = "Register",
                 shapekey = shapekey,
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 legendPosition = 'right',
                 legendTitleSize = 22,
                 legendLabSize = 18, 
                 legendIconSize = 7) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#ggsave(here("Plots", "PCA_TxB_BiplotPC3_PC4_Level.svg"), width = 12, height = 10)

# Save the two different representations of data points on PC2 and PC3 using the {patchwork} package
pRegisters / pLevels

#ggsave(here("Plots", "PCA_TxB_BiplotPC3_PC4_Register_vs_Level.svg"), width = 14, height = 20)

pLevels <- PCAtools::biplot(p,
                 x = "PC5",
                 y = "PC6",
                 lab = NULL, # Otherwise will try to label each data point!
                 #xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
                 #ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
                 colby = "Level",
                 pointSize = 2,
                 colkey = colkey,
                 shape = "Register",
                 shapekey = shapekey,
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 22,
                 legendPosition = 'right',
                 legendTitleSize = 22,
                 legendLabSize = 18, 
                 legendIconSize = 7) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#pLevels
#ggsave(here("Plots", "PCA_TxB_BiplotPC5_PC6_Level.svg"), width = 12, height = 10)

# Biplots to examine components more carefully

colkey = c(Conversation="#BD241E", Fiction="#A18A33", Informative="#15274D", Instructional="#F9B921", Personal="#722672")
shapekey = c(A=1, B=2, C=6, D=0, E=5)

pRegisters <- PCAtools::biplot(p,
                 x = "PC5",
                 y = "PC6",
                 lab = NULL, # Otherwise will try to label each data point!
                 #xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
                 #ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
                 colby = "Register",
                 pointSize = 2,
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 showLoadings = FALSE,
                 ellipse = TRUE,
                 axisLabSize = 22,
                 legendPosition = 'right',
                 legendTitleSize = 22,
                 legendLabSize = 18, 
                 legendIconSize = 7) +
  theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#ggsave("Plots/PCA_TxB_BiplotPC5_PC6.svg", width = 12, height = 10)

# Save the two different representations of data points on PC4 and PC5 using the {patchwork} package
pRegisters / pLevels

#ggsave(here("Plots", "PCA_TxB_BiplotPC5_PC6_Register_vs_Level.svg"), width = 14, height = 20)
## Very slow, open in zoomed out window!
# Add legend manually? Yes (take it from the biplot code below), let's not waste too much time, here. Or use Evert's mvar.pairs plot (though that also requires manual axis annotation)

PCAtools::pairsplot(p,
                 triangle = FALSE,
                 components = 1:6,
                 ncol = 3,
                 pointSize = 0.5,
                 lab = NULL, # Otherwise will try to label each data point!
                 colby = "Register",
                 colkey = colkey,
                 shape = "Level",
                 shapekey = shapekey,
                 legendPosition = "bottom")

#ggsave(here("Plots", "PCA_TxB_pairsplot.svg"), width = 15, height = 15)

Feature contributions (loadings) on each component

TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))

pca <- prcomp(TxBdata[,7:ncol(TxBdata)], scale.=FALSE) # All quantitative variables for all TxB files

# The rotated data that represents the observations / samples is stored in rotated, while the variable loadings are stored in loadings

loadings <- as.data.frame(pca$rotation[,1:6])
smallToZero <- function(x) {if_else(x < 0.05 & x > -0.05, 0, x)}
loadings %>% 
  filter_all(any_vars(. > abs(0.05))) %>% 
  mutate_all(smallToZero) %>% 
  round(3)
##           PC1    PC2    PC3    PC4    PC5    PC6
## ACT     0.080 -0.105  0.000 -0.101  0.000 -0.180
## ASPECT  0.100  0.000  0.139  0.000 -0.231  0.076
## AWL     0.216 -0.159 -0.121 -0.134  0.000  0.165
## BEMA   -0.217  0.000 -0.215  0.000 -0.078  0.080
## CC      0.052 -0.208 -0.191  0.000 -0.109 -0.123
## COMM    0.201  0.088  0.136  0.000  0.165  0.000
## COND    0.000  0.000  0.107 -0.245  0.132  0.000
## CONT   -0.254  0.110  0.000 -0.061  0.000  0.000
## DEMO   -0.117  0.075  0.000 -0.092  0.000 -0.123
## DMA    -0.201  0.144  0.000  0.000  0.000  0.072
## DOAUX   0.000  0.200  0.055 -0.147 -0.085  0.226
## DT      0.119  0.000  0.306  0.000 -0.295 -0.131
## EMPH   -0.191  0.000  0.059 -0.136  0.000  0.085
## EX     -0.104 -0.053 -0.115  0.053 -0.262 -0.178
## EXIST   0.000 -0.150 -0.094 -0.092  0.000  0.089
## FPP1S  -0.228  0.073  0.076  0.000  0.052  0.083
## FPUH   -0.161  0.148 -0.094  0.067  0.069  0.077
## IN      0.172 -0.178  0.000 -0.079 -0.156 -0.146
## JJPR   -0.171 -0.058 -0.115 -0.108 -0.080  0.111
## LD      0.159  0.000 -0.259  0.000  0.120  0.320
## MDCA    0.000  0.104 -0.179 -0.092  0.000 -0.328
## MDCO    0.000 -0.101  0.216  0.000  0.105  0.056
## MDWS   -0.067  0.000  0.000 -0.165  0.187  0.000
## MENTAL  0.138  0.128  0.117 -0.254  0.053  0.087
## NCOMP   0.000 -0.052 -0.237 -0.147  0.189  0.000
## NN      0.204 -0.090 -0.293  0.112  0.000  0.000
## OCCUR   0.000 -0.182  0.000  0.000  0.000  0.099
## PASS    0.000 -0.219 -0.059 -0.055  0.067  0.110
## PEAS   -0.056 -0.174  0.134 -0.129  0.163  0.065
## PLACE  -0.163  0.000 -0.072  0.092  0.000 -0.184
## POLITE -0.141  0.130 -0.067  0.000  0.070  0.000
## POS     0.000  0.000  0.000  0.164  0.289  0.000
## PROG   -0.115  0.000  0.110  0.000  0.140 -0.063
## QUAN   -0.148  0.000  0.116 -0.194 -0.206 -0.125
## QUPR   -0.101 -0.053  0.157 -0.105  0.084 -0.073
## RB     -0.188 -0.075  0.204  0.000  0.000  0.000
## RP      0.000 -0.089  0.139  0.000  0.244 -0.257
## SPLIT  -0.105 -0.178  0.000 -0.162  0.080  0.000
## SPP2    0.097  0.219  0.000 -0.252  0.166 -0.140
## THATD  -0.054  0.000  0.160 -0.236  0.000  0.244
## THSC   -0.058 -0.167  0.074 -0.137  0.000  0.000
## TIME   -0.125 -0.077  0.000  0.061  0.166 -0.103
## TPP3S  -0.057 -0.113  0.134  0.302  0.125  0.116
## TTR     0.000 -0.257 -0.053  0.000  0.200  0.093
## VBD    -0.082 -0.201  0.226  0.297  0.000  0.144
## VBG     0.000 -0.176  0.000 -0.220  0.000  0.115
## VBN     0.000 -0.184 -0.073  0.000  0.000  0.212
## VIMP    0.247  0.154  0.000 -0.076  0.000  0.000
## VPRT   -0.155  0.054 -0.322 -0.221  0.000 -0.104
## WHQU    0.110  0.230  0.000 -0.092  0.000  0.126
## WHSC    0.114 -0.112  0.000 -0.150  0.245 -0.238
## XX0    -0.217  0.000  0.056 -0.055  0.000  0.071
## YNQU    0.000  0.234  0.000 -0.083  0.000  0.206
#write_last_clip()

# Compare frequencies of the individual features across different registers and levels

TxBcounts %>% 
  group_by(Register, Level) %>% 
  summarise(median(VBD), MAD(VBD)) %>% 
  mutate(across(where(is.numeric), round, 2)) %>% 
  as.data.frame()
## `summarise()` has grouped output by 'Register'. You can override using the `.groups` argument.
##         Register Level median(VBD) MAD(VBD)
## 1   Conversation     A        2.66     3.94
## 2   Conversation     B       14.88    11.76
## 3   Conversation     C       18.02    13.30
## 4   Conversation     D       18.36    13.68
## 5   Conversation     E       16.14     8.75
## 6        Fiction     A        1.72     2.55
## 7        Fiction     B       56.39    14.31
## 8        Fiction     C       63.33    12.17
## 9        Fiction     D       61.69    20.98
## 10       Fiction     E       54.13    24.37
## 11   Informative     A       22.66    28.74
## 12   Informative     B       32.27    20.97
## 13   Informative     C       31.63    25.39
## 14   Informative     D       39.24    25.97
## 15   Informative     E       32.84    18.82
## 16 Instructional     A        1.82     2.70
## 17 Instructional     B        5.75     4.87
## 18 Instructional     C        6.78     4.35
## 19 Instructional     D        7.09     4.63
## 20 Instructional     E        6.67     4.58
## 21      Personal     A       32.59    11.94
## 22      Personal     B       26.83    29.30
## 23      Personal     C       32.82    18.93
## 24      Personal     D       27.82    13.97
## 25      Personal     E       22.22    10.01
# Graph of variables
factoextra::fviz_pca_var(pca,
             axes = c(1,2),
             select.var = list(cos2 = 0.1),
             col.var = "contrib", # Colour by contributions to the PC
             gradient.cols = c("#F9B921", "#DB241E", "#722672"),
             title = "",
             repel = TRUE, # Try to avoid too much text overlapping
             ggtheme = ggthemes::theme_few())
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

#ggsave(here("Plots", "fviz_pca_var_PC1_PC2.svg"), width = 11, height = 9)

factoextra::fviz_pca_var(pca,
             axes = c(3,2),
             select.var = list(contrib = 30),
             col.var = "contrib", # Colour by contributions to the PC
             gradient.cols = c("#F9B921", "#DB241E", "#722672"),
             title = "",
             repel = TRUE, # Try to avoid too much text overlapping
             ggtheme = ggthemes::theme_few())
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#ggsave(here("Plots", "fviz_pca_var_PC3_PC2.svg"), width = 9, height = 8)

factoextra::fviz_pca_var(pca,
             axes = c(3,4),
             select.var = list(contrib = 30),
             col.var = "contrib", # Colour by contributions to the PC
             gradient.cols = c("#F9B921", "#DB241E", "#722672"),
             title = "",
             repel = TRUE, # Try to avoid too much text overlapping
             ggtheme = ggthemes::theme_few())

#ggsave(here("Plots", "fviz_pca_var_PC3_PC4.svg"), width = 9, height = 8)

factoextra::fviz_pca_var(pca,
             axes = c(5,6),
             select.var = list(cos2 = 0.05),
             col.var = "contrib", # Colour by contributions to the PC
             gradient.cols = c("#F9B921", "#DB241E", "#722672"),
             title = "",
             repel = TRUE, # Try to avoid too much text overlapping
             ggtheme = ggthemes::theme_few())

#ggsave(here("Plots", "fviz_pca_var_PC5_PC6.svg"), width = 9, height = 8)

Visualising results in shiny app

colnames(TxBdata)

res.pca <- FactoMineR::PCA(TxBdata, quali.sup = 1:5, quanti.sup = 6, scale.unit = FALSE, graph = FALSE, ncp = 6)

summary(res.pca, ncp = 6, nbelements = 6)

# For some reason I don't understand, the dimdesc() function only works if I eliminate the supplementary variables (although they obviously don't contribute to the dimensions anyway!)
res.pca2 <- FactoMineR::PCA(TxBdata[,7:ncol(TxBdata)], scale.unit = FALSE, graph = FALSE)
# 
dims <- dimdesc(res.pca2, proba = 0.0001)

dims

dims$Dim.1
dims$Dim.2
dims$Dim.3
dims$Dim.4
dims$Dim.5

plot(res.pca, choix = "ind", habillage = "Register", col.hab = colours[1:5], cex = 1.1, select = "contrib 10", title = "Graphe des individus")

plot(res.pca, choix="var", select="cos2 0.6")  # plot the variables with cos2 greater than 0.6

library(Factoshiny)
TxBdata2 <- TxBdata %>% 
  select(-Words, -Filename)

res.shiny <- PCAshiny(TxBdata2)

Exploring the dimensions of the model

Descriptive stats of dimensions

# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/#pca-results-for-variables

#TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))

pca <- prcomp(TxBdata[,7:ncol(TxBdata)], scale.=FALSE) # All quantitative variables for all TxB files
register  <- factor(TxBdata[,"Register"]) # Register
level <- factor(TxBdata[,"Level"]) # Textbook proficiency level

summary(register)
##  Conversation       Fiction   Informative Instructional      Personal 
##           583           285           363           644            86
summary(level)
##   A   B   C   D   E 
## 281 469 494 468 249
summary(pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.1693 1.7776 1.08902 1.00207 0.84288 0.76792 0.70295
## Proportion of Variance 0.2108 0.1416 0.05313 0.04499 0.03183 0.02642 0.02214
## Cumulative Proportion  0.2108 0.3524 0.40553 0.45051 0.48234 0.50876 0.53090
##                            PC8     PC9    PC10    PC11    PC12   PC13    PC14
## Standard deviation     0.67987 0.66896 0.62006 0.60634 0.59078 0.5882 0.58213
## Proportion of Variance 0.02071 0.02005 0.01722 0.01647 0.01564 0.0155 0.01518
## Cumulative Proportion  0.55160 0.57165 0.58888 0.60535 0.62098 0.6365 0.65167
##                           PC15    PC16    PC17    PC18   PC19    PC20   PC21
## Standard deviation     0.56560 0.55730 0.54868 0.54180 0.5325 0.52111 0.5153
## Proportion of Variance 0.01433 0.01391 0.01349 0.01315 0.0127 0.01217 0.0119
## Cumulative Proportion  0.66600 0.67991 0.69340 0.70655 0.7192 0.73142 0.7433
##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.51058 0.50478 0.49919 0.48542 0.48162 0.47183 0.46835
## Proportion of Variance 0.01168 0.01142 0.01116 0.01056 0.01039 0.00997 0.00983
## Cumulative Proportion  0.75499 0.76641 0.77757 0.78813 0.79852 0.80849 0.81832
##                           PC29    PC30    PC31    PC32    PC33    PC34    PC35
## Standard deviation     0.46383 0.46072 0.45286 0.44463 0.44124 0.43630 0.42986
## Proportion of Variance 0.00964 0.00951 0.00919 0.00886 0.00872 0.00853 0.00828
## Cumulative Proportion  0.82796 0.83747 0.84666 0.85551 0.86423 0.87276 0.88104
##                           PC36    PC37    PC38    PC39    PC40    PC41    PC42
## Standard deviation     0.42419 0.41741 0.41442 0.40058 0.39737 0.39455 0.38291
## Proportion of Variance 0.00806 0.00781 0.00769 0.00719 0.00707 0.00697 0.00657
## Cumulative Proportion  0.88910 0.89691 0.90460 0.91179 0.91887 0.92584 0.93241
##                           PC43    PC44    PC45    PC46    PC47   PC48    PC49
## Standard deviation     0.37583 0.36825 0.35414 0.34269 0.33970 0.3308 0.32069
## Proportion of Variance 0.00633 0.00608 0.00562 0.00526 0.00517 0.0049 0.00461
## Cumulative Proportion  0.93874 0.94481 0.95043 0.95569 0.96086 0.9658 0.97037
##                          PC50    PC51    PC52    PC53    PC54    PC55   PC56
## Standard deviation     0.3098 0.30595 0.28868 0.27485 0.25385 0.24257 0.2167
## Proportion of Variance 0.0043 0.00419 0.00373 0.00338 0.00289 0.00264 0.0021
## Cumulative Proportion  0.9747 0.97887 0.98260 0.98598 0.98887 0.99151 0.9936
##                           PC57    PC58    PC59   PC60    PC61
## Standard deviation     0.21094 0.18995 0.18519 0.1496 0.07340
## Proportion of Variance 0.00199 0.00162 0.00154 0.0010 0.00024
## Cumulative Proportion  0.99560 0.99722 0.99876 0.9998 1.00000
## Access to the PCA results
pca$rotation[,1]
##          ACT          AMP       ASPECT          AWL         BEMA           CC 
##  0.080060346 -0.116127587  0.099543545  0.216440492 -0.217351695  0.051555936 
##         COMM         COND         CONT          CUZ         DEMO          DMA 
##  0.201198439 -0.011408054 -0.254470196 -0.085519298 -0.116742474 -0.201458765 
##        DOAUX           DT         EMPH           EX        EXIST        FPP1P 
## -0.007901611  0.119111349 -0.190546478 -0.104230420 -0.023496930 -0.166286363 
##        FPP1S         FPUH         FREQ           IN         JJAT         JJPR 
## -0.228153745 -0.161136362 -0.026758914  0.171985691 -0.057458796 -0.171374793 
##           LD         MDCA         MDCO         MDWS       MENTAL        NCOMP 
##  0.159464818 -0.041311476 -0.047633498 -0.066621945  0.137513754  0.038907288 
##           NN        OCCUR         PASS         PEAS          PIT        PLACE 
##  0.204121990  0.015900251 -0.006229030 -0.056468983 -0.185192182 -0.163070169 
##       POLITE          POS         PROG         QUAN         QUPR           RB 
## -0.141415809 -0.011521906 -0.114565231 -0.148468753 -0.100834781 -0.187688767 
##           RP        SPLIT         SPP2        THATD         THRC         THSC 
##  0.001734942 -0.105047453  0.096602054 -0.054135134  0.015899485 -0.057894629 
##         TIME        TPP3P        TPP3S          TTR          VBD          VBG 
## -0.124827074 -0.009218624 -0.056785938 -0.044364165 -0.082457830  0.040924578 
##          VBN         VIMP         VPRT         WHQU         WHSC          XX0 
##  0.027018470  0.246976702 -0.154619968  0.109596509  0.114186152 -0.217382666 
##         YNQU 
## -0.027774744
res.ind <- cbind(TxBdata[,1:5], as.data.frame(pca$x)[,1:6])
head(res.ind)
##                                            Filename Country    Series Level
## 1               Achievers_A1_Instructional_0012.txt   Spain Achievers     A
## 2 Solutions_Pre-Intermediate_Instructional_0023.txt   Spain Solutions     B
## 3                    Achievers_A2_Personal_0003.txt   Spain Achievers     B
## 4            Achievers_B1_plus_Informative_0007.txt   Spain Achievers     D
## 5                            POC_5e_Spoken_0003.txt  France       POC     B
## 6                       Access_4_Narrative_0013.txt Germany    Access     D
##        Register       PC1        PC2         PC3           PC4         PC5
## 1 Instructional  3.093368  1.9013518  0.05136536  0.1894348974  0.02132023
## 2 Instructional  3.323671  0.6341624  0.50300157  0.2640014332 -1.01850292
## 3      Personal -1.287205  1.7556911 -0.94319919 -1.0893089092  0.44173584
## 4   Informative  1.416627 -2.7239559 -1.76705268 -1.1605859275  1.22300895
## 5  Conversation -3.043701  2.3029358 -0.08889758  0.0002598051 -0.54563425
## 6       Fiction -1.029236 -1.4477043  1.30971484  2.1975802039 -1.57940462
##          PC6
## 1  0.9301099
## 2 -0.1214005
## 3  0.3882705
## 4 -0.9204889
## 5 -0.2713718
## 6 -1.0899312
res.ind %>% 
  group_by(Register) %>% 
  summarise_if(is.numeric, mean)
## # A tibble: 5 × 7
##   Register          PC1    PC2     PC3     PC4      PC5     PC6
##   <fct>           <dbl>  <dbl>   <dbl>   <dbl>    <dbl>   <dbl>
## 1 Conversation  -2.29    0.929 -0.141  -0.268  -0.0248   0.0639
## 2 Fiction       -0.851  -0.805  1.02    1.09    0.111   -0.0998
## 3 Informative    0.0572 -2.45  -0.830   0.0134 -0.0772   0.117 
## 4 Instructional  2.68    0.931  0.150  -0.242   0.00795 -0.0670
## 5 Personal      -1.92   -0.286 -0.0515 -0.0200  0.0672  -0.0938
res.ind %>% 
  group_by(Register, Level) %>% 
  summarise_if(is.numeric, mean) %>% 
  mutate(across(where(is.numeric), round, 2)) %>% 
  as.data.frame()
##         Register Level   PC1   PC2   PC3   PC4   PC5   PC6
## 1   Conversation     A -2.39  2.39 -1.23  0.71 -0.45 -0.01
## 2   Conversation     B -2.55  1.45 -0.16 -0.14 -0.14  0.09
## 3   Conversation     C -2.25  0.70  0.18 -0.41  0.09 -0.02
## 4   Conversation     D -2.10 -0.08  0.28 -0.73  0.17  0.09
## 5   Conversation     E -1.94 -0.18 -0.04 -0.90  0.32  0.32
## 6        Fiction     A -0.95  0.85 -0.54  1.48 -0.31 -0.46
## 7        Fiction     B -0.96 -0.32  0.96  1.45  0.02  0.00
## 8        Fiction     C -0.98 -0.81  1.62  1.23  0.26 -0.16
## 9        Fiction     D -0.71 -1.57  1.27  0.72  0.21 -0.01
## 10       Fiction     E -0.68 -1.60  1.21  0.63  0.24 -0.05
## 11   Informative     A -0.09 -1.11 -1.94  0.87 -0.88 -0.15
## 12   Informative     B  0.13 -1.95 -1.13  0.30 -0.34  0.16
## 13   Informative     C -0.02 -2.37 -0.68 -0.03 -0.06 -0.01
## 14   Informative     D  0.06 -2.89 -0.45 -0.19  0.06  0.10
## 15   Informative     E  0.17 -3.12 -0.78 -0.37  0.40  0.46
## 16 Instructional     A  2.89  1.55 -0.20  0.46 -0.34 -0.24
## 17 Instructional     B  2.64  1.09  0.13 -0.11 -0.03 -0.18
## 18 Instructional     C  2.59  0.99  0.28 -0.32 -0.07  0.01
## 19 Instructional     D  2.63  0.70  0.28 -0.49  0.12  0.07
## 20 Instructional     E  2.71  0.17  0.15 -0.76  0.47 -0.06
## 21      Personal     A -1.84  0.53 -1.11  1.21 -0.31  0.12
## 22      Personal     B -1.96  0.13 -0.37  0.31  0.12 -0.15
## 23      Personal     C -2.05 -0.46  0.52 -0.17  0.06 -0.03
## 24      Personal     D -1.89 -1.05  0.45 -0.63  0.39 -0.06
## 25      Personal     E -1.69 -0.83  0.03 -1.18 -0.14 -0.43
res.ind %>% 
  select(Register, Level, PC2) %>% 
  group_by(Register, Level) %>% 
  summarise_if(is.numeric, c(Median = median, MAD = mad))%>% 
  mutate(across(where(is.numeric), round, 2)) %>% 
  as.data.frame()
##         Register Level Median  MAD
## 1   Conversation     A   2.60 0.76
## 2   Conversation     B   1.60 1.29
## 3   Conversation     C   0.85 1.26
## 4   Conversation     D   0.20 1.68
## 5   Conversation     E  -0.19 1.28
## 6        Fiction     A   1.09 0.59
## 7        Fiction     B  -0.33 0.91
## 8        Fiction     C  -0.82 0.76
## 9        Fiction     D  -1.59 1.05
## 10       Fiction     E  -1.54 1.04
## 11   Informative     A  -1.00 0.75
## 12   Informative     B  -2.07 0.97
## 13   Informative     C  -2.53 1.06
## 14   Informative     D  -3.11 0.73
## 15   Informative     E  -3.29 0.76
## 16 Instructional     A   1.53 0.66
## 17 Instructional     B   1.21 0.72
## 18 Instructional     C   1.03 0.48
## 19 Instructional     D   0.74 0.59
## 20 Instructional     E   0.23 0.91
## 21      Personal     A   0.58 0.45
## 22      Personal     B   0.27 1.50
## 23      Personal     C  -0.76 0.67
## 24      Personal     D  -0.97 0.69
## 25      Personal     E  -1.48 0.95
# Search for example texts to illustrate results
res.ind %>% 
  #filter(PC3 > 2.5 & PC2 < -2) %>% 
  filter(PC4 < -2.5) %>% 
  select(Filename, PC3, PC4) %>% 
  mutate(across(where(is.numeric), round, 2))
##                                      Filename   PC3   PC4
## 1               GreenLine_5_Personal_0002.txt -0.90 -2.78
## 2                       NGL_4_Spoken_0007.txt  0.73 -2.61
## 3                        NM_2_Spoken_0012.txt -0.36 -2.67
## 4                  EIM_3_Informative_0002.txt -1.46 -2.51
## 5 Solutions_Intermediate_Plus_Spoken_0040.txt  0.24 -2.53
## 6         Achievers_B1_plus_Personal_0001.txt  0.74 -2.66

Mixed-effects models of dimension scores and plots

# Dimension 1
# Model with only Register as a fixed effect
lm1 <- lm(PC1 ~ Register, data = res.ind)
summary(lm1)
## 
## Call:
## lm(formula = PC1 ~ Register, data = res.ind)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.97018 -0.47497 -0.00695  0.44715  2.98546 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.29071    0.03006 -76.212  < 2e-16 ***
## RegisterFiction        1.43957    0.05245  27.444  < 2e-16 ***
## RegisterInformative    2.34787    0.04852  48.387  < 2e-16 ***
## RegisterInstructional  4.96575    0.04149 119.689  < 2e-16 ***
## RegisterPersonal       0.36737    0.08383   4.382 1.24e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7257 on 1956 degrees of freedom
## Multiple R-squared:  0.8883, Adjusted R-squared:  0.8881 
## F-statistic:  3889 on 4 and 1956 DF,  p-value: < 2.2e-16
# Models with Textbook series as random intercepts
md1 <- lmer(PC1 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md1Register <- lmer(PC1 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md1Level <- lmer(PC1 ~ Level + (1|Series), data = res.ind, REML = FALSE)

anova(md1, md1Register, md1Level)
## Data: res.ind
## Models:
## md1Register: PC1 ~ Register + (1 | Series)
## md1Level: PC1 ~ Level + (1 | Series)
## md1: PC1 ~ Register * Level + (1 | Series)
##             npar    AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)    
## md1Register    7 4080.4 4119.4 -2033.2   4066.4                        
## md1Level       7 8532.6 8571.6 -4259.3   8518.6     0  0               
## md1           27 4073.5 4224.2 -2009.8   4019.5  4499 20  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md1) # Marginal R2 = 0.890
  PC 1
Predictors Estimates CI p
(Intercept) -2.37 -2.59 – -2.15 <0.001
Register [Fiction] 1.61 1.35 – 1.86 <0.001
Register [Informative] 2.24 1.97 – 2.50 <0.001
Register [Instructional] 5.29 5.10 – 5.47 <0.001
Register [Personal] 0.48 0.08 – 0.89 0.018
Level [B] -0.07 -0.25 – 0.10 0.402
Level [C] 0.12 -0.05 – 0.29 0.155
Level [D] 0.23 0.06 – 0.41 0.010
Level [E] 0.29 0.05 – 0.52 0.017
Register [Fiction] *
Level [B]
0.11 -0.21 – 0.43 0.501
Register [Informative] *
Level [B]
0.37 0.04 – 0.69 0.028
Register [Instructional]
* Level [B]
-0.14 -0.38 – 0.09 0.236
Register [Personal] *
Level [B]
0.03 -0.45 – 0.52 0.895
Register [Fiction] *
Level [C]
-0.25 -0.57 – 0.08 0.135
Register [Informative] *
Level [C]
-0.00 -0.32 – 0.31 0.983
Register [Instructional]
* Level [C]
-0.39 -0.63 – -0.15 0.001
Register [Personal] *
Level [C]
-0.22 -0.72 – 0.28 0.379
Register [Fiction] *
Level [D]
-0.05 -0.38 – 0.27 0.741
Register [Informative] *
Level [D]
-0.01 -0.33 – 0.30 0.928
Register [Instructional]
* Level [D]
-0.48 -0.72 – -0.23 <0.001
Register [Personal] *
Level [D]
-0.07 -0.61 – 0.46 0.782
Register [Fiction] *
Level [E]
-0.23 -0.60 – 0.14 0.224
Register [Informative] *
Level [E]
-0.02 -0.39 – 0.35 0.910
Register [Instructional]
* Level [E]
-0.53 -0.83 – -0.22 0.001
Register [Personal] *
Level [E]
-0.07 -0.70 – 0.55 0.821
Random Effects
σ2 0.45
τ00 Series 0.07
ICC 0.14
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.890 / 0.905
tab_model(md1Register) # Marginal R2 = 0.877
  PC 1
Predictors Estimates CI p
(Intercept) -2.28 -2.47 – -2.09 <0.001
Register [Fiction] 1.55 1.45 – 1.65 <0.001
Register [Informative] 2.34 2.25 – 2.43 <0.001
Register [Instructional] 4.99 4.92 – 5.07 <0.001
Register [Personal] 0.41 0.25 – 0.56 <0.001
Random Effects
σ2 0.46
τ00 Series 0.08
ICC 0.14
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.887 / 0.903
tab_model(md1Level) # Marginal R2 = 0.002
  PC 1
Predictors Estimates CI p
(Intercept) 0.08 -0.30 – 0.47 0.668
Level [B] -0.11 -0.43 – 0.21 0.495
Level [C] -0.10 -0.41 – 0.21 0.521
Level [D] 0.07 -0.24 – 0.39 0.645
Level [E] 0.14 -0.23 – 0.51 0.461
Random Effects
σ2 4.46
τ00 Series 0.20
ICC 0.04
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.002 / 0.044
# Tweak plot aestetics with: https://cran.r-project.org/web/packages/sjPlot/vignettes/custplot.html
# Colour customisation trick from: https://stackoverflow.com/questions/55598920/different-line-colors-in-forest-plot-output-from-sjplot-r-package

# Plot of fixed effects:
plot_model(md1Register, 
           type = "est",
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = palette[c(1:3,8,7)],
           group.terms = c(1:5), 
           title = "",
           wrap.labels = 40,
           axis.title = "PC1 estimated coefficients") +
  theme_sjplot2() 

#ggsave(here("Plots", "TxB_PCA1_lmer_fixedeffects_Register.svg"), height = 3, width = 8)

Register_results <- emmeans(md1Register, "Register")
summary(Register_results)
##  Register       emmean    SE   df lower.CL upper.CL
##  Conversation  -2.2793 0.102 11.6   -2.502   -2.056
##  Fiction       -0.7267 0.106 13.9   -0.955   -0.498
##  Informative    0.0603 0.104 12.7   -0.165    0.286
##  Instructional  2.7141 0.101 11.3    2.492    2.937
##  Personal      -1.8734 0.122 25.5   -2.125   -1.622
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95
comparisons <- pairs(Register_results, adjust = "tukey")
comparisons
##  contrast                     estimate     SE   df t.ratio  p.value
##  Conversation - Fiction         -1.553 0.0508 1963  -30.535 <.0001 
##  Conversation - Informative     -2.340 0.0465 1961  -50.341 <.0001 
##  Conversation - Instructional   -4.993 0.0399 1961 -125.141 <.0001 
##  Conversation - Personal        -0.406 0.0791 1958   -5.134 <.0001 
##  Fiction - Informative          -0.787 0.0557 1962  -14.135 <.0001 
##  Fiction - Instructional        -3.441 0.0497 1962  -69.168 <.0001 
##  Fiction - Personal              1.147 0.0840 1958   13.645 <.0001 
##  Informative - Instructional    -2.654 0.0447 1957  -59.399 <.0001 
##  Informative - Personal          1.934 0.0816 1957   23.692 <.0001 
##  Instructional - Personal        4.587 0.0780 1957   58.820 <.0001 
## 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: tukey method for comparing a family of 5 estimates
#write_last_clip()
confint(comparisons)
##  contrast                     estimate     SE   df lower.CL upper.CL
##  Conversation - Fiction         -1.553 0.0508 1963   -1.691   -1.414
##  Conversation - Informative     -2.340 0.0465 1961   -2.466   -2.213
##  Conversation - Instructional   -4.993 0.0399 1961   -5.102   -4.884
##  Conversation - Personal        -0.406 0.0791 1958   -0.622   -0.190
##  Fiction - Informative          -0.787 0.0557 1962   -0.939   -0.635
##  Fiction - Instructional        -3.441 0.0497 1962   -3.577   -3.305
##  Fiction - Personal              1.147 0.0840 1958    0.917    1.376
##  Informative - Instructional    -2.654 0.0447 1957   -2.776   -2.532
##  Informative - Personal          1.934 0.0816 1957    1.711    2.156
##  Instructional - Personal        4.587 0.0780 1957    4.374    4.800
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 5 estimates
#write_last_clip()

TxBdata %>% 
  filter(Register == "Instructional") %>% 
  group_by(Level) %>% 
  summarise(median(VPRT))
## # A tibble: 5 × 2
##   Level `median(VPRT)`
##   <fct>          <dbl>
## 1 A             -0.585
## 2 B             -0.456
## 3 C             -0.450
## 4 D             -0.428
## 5 E             -0.279
# Plot of random effects:
plot_model(md1, 
           type = "re", # Option to visualise random effects
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = "bw",
           wrap.labels = 40,
           axis.title = "PC1 estimated coefficients") +
  theme_sjplot2()

#ggsave(here("Plots", "TxB_PCA1_lmer_randomeffects.svg"), height = 3, width = 8)

# Dimension 2
md2 <- lmer(PC2 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md2Register <- lmer(PC2 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md2Level <- lmer(PC2 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md2, md2Register, md2Level)
## Data: res.ind
## Models:
## md2Register: PC2 ~ Register + (1 | Series)
## md2Level: PC2 ~ Level + (1 | Series)
## md2: PC2 ~ Register * Level + (1 | Series)
##             npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## md2Register    7 6155.2 6194.3 -3070.6   6141.2                         
## md2Level       7 7366.3 7405.4 -3676.2   7352.3    0.0  0               
## md2           27 5338.6 5489.3 -2642.3   5284.6 2067.7 20  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md2) # Marginal R2 = 0.706
  PC 2
Predictors Estimates CI p
(Intercept) 2.48 2.20 – 2.76 <0.001
Register [Fiction] -1.76 -2.12 – -1.41 <0.001
Register [Informative] -3.49 -3.86 – -3.12 <0.001
Register [Instructional] -0.88 -1.14 – -0.62 <0.001
Register [Personal] -1.92 -2.47 – -1.36 <0.001
Level [B] -0.95 -1.18 – -0.71 <0.001
Level [C] -1.72 -1.96 – -1.49 <0.001
Level [D] -2.38 -2.62 – -2.13 <0.001
Level [E] -2.74 -3.06 – -2.41 <0.001
Register [Fiction] *
Level [B]
-0.14 -0.58 – 0.30 0.527
Register [Informative] *
Level [B]
0.12 -0.33 – 0.58 0.592
Register [Instructional]
* Level [B]
0.50 0.17 – 0.83 0.003
Register [Personal] *
Level [B]
0.53 -0.14 – 1.21 0.119
Register [Fiction] *
Level [C]
0.05 -0.40 – 0.50 0.828
Register [Informative] *
Level [C]
0.44 0.00 – 0.88 0.048
Register [Instructional]
* Level [C]
1.12 0.80 – 1.45 <0.001
Register [Personal] *
Level [C]
0.61 -0.08 – 1.30 0.083
Register [Fiction] *
Level [D]
-0.01 -0.46 – 0.43 0.947
Register [Informative] *
Level [D]
0.52 0.08 – 0.96 0.021
Register [Instructional]
* Level [D]
1.50 1.16 – 1.83 <0.001
Register [Personal] *
Level [D]
0.66 -0.08 – 1.39 0.080
Register [Fiction] *
Level [E]
0.24 -0.27 – 0.75 0.352
Register [Informative] *
Level [E]
0.49 -0.02 – 1.01 0.060
Register [Instructional]
* Level [E]
1.16 0.75 – 1.58 <0.001
Register [Personal] *
Level [E]
1.12 0.26 – 1.99 0.011
Random Effects
σ2 0.85
τ00 Series 0.10
ICC 0.10
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.706 / 0.737
tab_model(md2Register) # Marginal R2 = 0.558
  PC 2
Predictors Estimates CI p
(Intercept) 0.98 0.76 – 1.19 <0.001
Register [Fiction] -1.90 -2.07 – -1.73 <0.001
Register [Informative] -3.39 -3.55 – -3.24 <0.001
Register [Instructional] -0.04 -0.18 – 0.09 0.520
Register [Personal] -1.31 -1.58 – -1.05 <0.001
Random Effects
σ2 1.33
τ00 Series 0.09
ICC 0.06
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.558 / 0.585
tab_model(md2Level) # Marginal R2 = 0.201
  PC 2
Predictors Estimates CI p
(Intercept) 1.45 1.20 – 1.71 <0.001
Level [B] -0.94 -1.18 – -0.71 <0.001
Level [C] -1.46 -1.69 – -1.23 <0.001
Level [D] -2.14 -2.38 – -1.91 <0.001
Level [E] -2.57 -2.85 – -2.29 <0.001
Random Effects
σ2 2.47
τ00 Series 0.07
ICC 0.03
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.201 / 0.223
# Plot of fixed effects:
plot_model(md2, 
           type = "est",
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = palette[c(1:3,8,7)],
           group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5), 
           title = "",
           wrap.labels = 40,
           axis.title = "PC2 estimated coefficients") +
  theme_sjplot2() 

#ggsave(here("Plots", "TxB_PCA2_lmer_fixedeffects.svg"), height = 8, width = 8)

#svg(here("Plots", "TxB_predicted_PC2_scores_interactions.svg"), height = 5, width = 8)
visreg(md2, xvar = "Level", by="Register", type = "conditional",
       line=list(col="darkred"), 
       xlab = "Textbook Level", ylab = "PC2"
       #,gg = TRUE
       ,layout=c(5,1)
)

dev.off()
## null device 
##           1
# Plots of random effects
## Random intercepts
plot_model(md2, 
           type = "re", # Option to visualise random effects
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = "bw",
           wrap.labels = 40,
           axis.title = "PC2 estimated coefficients") +
  theme_sjplot2()
#ggsave(here("Plots", "TxB_PCA2_lmer_randomeffects.svg"), height = 3, width = 8)

# Textbook Series-Register interactions
visreg::visreg(md2, "Register", by="Series", re.form=~(1|Series),
               ylab="PC2", line=list(col="darkred"))

visreg(md2, xvar = "Series", by="Level", type = "conditional", re.form=~(1|Series), 
       line=list(col="darkred"), xlab = "Textbook Series", ylab = "PC2",
       layout=c(1,5))

# Dimension 3
md3 <- lmer(PC3 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md3Register <- lmer(PC3 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md3Level <- lmer(PC3 ~ Level + (1|Series), data = res.ind, REML = FALSE)

anova(md3, md3Register, md3Level)
## Data: res.ind
## Models:
## md3Register: PC3 ~ Register + (1 | Series)
## md3Level: PC3 ~ Level + (1 | Series)
## md3: PC3 ~ Register * Level + (1 | Series)
##             npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## md3Register    7 5139.9 5179.0 -2563.0   5125.9                         
## md3Level       7 5530.9 5569.9 -2758.4   5516.9   0.00  0               
## md3           27 4597.8 4748.5 -2271.9   4543.8 973.05 20  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md3) # Marginal R2 = 0.429
  PC 3
Predictors Estimates CI p
(Intercept) -1.34 -1.62 – -1.05 <0.001
Register [Fiction] 0.78 0.48 – 1.07 <0.001
Register [Informative] -0.77 -1.08 – -0.47 <0.001
Register [Instructional] 1.03 0.82 – 1.24 <0.001
Register [Personal] 0.20 -0.26 – 0.66 0.385
Level [B] 1.05 0.85 – 1.25 <0.001
Level [C] 1.39 1.20 – 1.58 <0.001
Level [D] 1.50 1.30 – 1.70 <0.001
Level [E] 1.61 1.34 – 1.88 <0.001
Register [Fiction] *
Level [B]
0.44 0.08 – 0.81 0.017
Register [Informative] *
Level [B]
-0.16 -0.54 – 0.21 0.397
Register [Instructional]
* Level [B]
-0.71 -0.99 – -0.44 <0.001
Register [Personal] *
Level [B]
-0.25 -0.81 – 0.30 0.373
Register [Fiction] *
Level [C]
0.79 0.42 – 1.16 <0.001
Register [Informative] *
Level [C]
-0.18 -0.54 – 0.19 0.340
Register [Instructional]
* Level [C]
-0.95 -1.22 – -0.68 <0.001
Register [Personal] *
Level [C]
0.16 -0.41 – 0.73 0.584
Register [Fiction] *
Level [D]
0.34 -0.03 – 0.71 0.071
Register [Informative] *
Level [D]
0.00 -0.36 – 0.36 0.993
Register [Instructional]
* Level [D]
-1.01 -1.29 – -0.73 <0.001
Register [Personal] *
Level [D]
0.00 -0.61 – 0.61 0.998
Register [Fiction] *
Level [E]
0.36 -0.06 – 0.78 0.096
Register [Informative] *
Level [E]
-0.16 -0.58 – 0.27 0.463
Register [Instructional]
* Level [E]
-1.03 -1.38 – -0.69 <0.001
Register [Personal] *
Level [E]
-0.36 -1.07 – 0.36 0.329
Random Effects
σ2 0.58
τ00 Series 0.14
ICC 0.19
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.429 / 0.539
tab_model(md3Register) # Marginal R2 = 0.272
  PC 3
Predictors Estimates CI p
(Intercept) -0.21 -0.45 – 0.03 0.083
Register [Fiction] 1.27 1.14 – 1.40 <0.001
Register [Informative] -0.73 -0.85 – -0.61 <0.001
Register [Instructional] 0.29 0.19 – 0.39 <0.001
Register [Personal] 0.14 -0.07 – 0.34 0.188
Random Effects
σ2 0.79
τ00 Series 0.12
ICC 0.13
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.272 / 0.369
tab_model(md3Level) # Marginal R2 = 0.119
  PC 3
Predictors Estimates CI p
(Intercept) -0.93 -1.18 – -0.68 <0.001
Level [B] 0.77 0.62 – 0.91 <0.001
Level [C] 1.05 0.91 – 1.20 <0.001
Level [D] 1.10 0.96 – 1.25 <0.001
Level [E] 1.18 1.01 – 1.36 <0.001
Random Effects
σ2 0.96
τ00 Series 0.12
ICC 0.11
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.119 / 0.214
# Plot of fixed effects:
plot_model(md3, 
           type = "est",
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = palette[c(1:3,8,7)],
           group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5), 
           title = "",
           wrap.labels = 40,
           axis.title = "PC3 estimated coefficients") +
  theme_sjplot2() 
#ggsave(here("Plots", "TxB_PCA3_lmer_fixedeffects.svg"), height = 8, width = 8)

#svg(here("Plots", "TxB_predicted_PC3_scores_interactions.svg"), height = 5, width = 8)
visreg(md3, xvar = "Level", by="Register", type = "conditional",
       line=list(col="darkred"), 
       xlab = "Textbook Level", ylab = "PC3"
       #,gg = TRUE
       ,layout=c(5,1)
)
dev.off()
## null device 
##           1
# Plot of random effects:
plot_model(md3, 
           type = "re", # Option to visualise random effects
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           color = "bw",
           wrap.labels = 40,
           axis.title = "PC3 estimated coefficients") +
  theme_sjplot2()
#ggsave(here("Plots", "TxB_PCA3_lmer_randomeffects.svg"), height = 3, width = 8)

# Textbook Series-Register interactions
visreg::visreg(md3, "Series", by="Register", re.form=~(1|Series),
               ylab="PC2", line=list(col="darkred"))

visreg(md2, xvar = "Level", by="Series", type = "conditional", re.form=~(1|Series), 
       line=list(col="darkred"), xlab = "Textbook Series", ylab = "PC2")

# Dimension 4
md4 <- lmer(PC4 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md4Register <- lmer(PC4 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md4Level <- lmer(PC4 ~ Level + (1|Series), data = res.ind, REML = FALSE)

anova(md4, md4Register, md4Level)
## Data: res.ind
## Models:
## md4Register: PC4 ~ Register + (1 | Series)
## md4Level: PC4 ~ Level + (1 | Series)
## md4: PC4 ~ Register * Level + (1 | Series)
##             npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## md4Register    7 5034.0 5073.0 -2510.0   5020.0                         
## md4Level       7 5085.2 5124.3 -2535.6   5071.2   0.00  0               
## md4           27 4446.0 4596.7 -2196.0   4392.0 679.17 20  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md4) # Marginal R2 = 0.401
  PC 4
Predictors Estimates CI p
(Intercept) 0.77 0.55 – 1.00 <0.001
Register [Fiction] 0.74 0.46 – 1.02 <0.001
Register [Informative] 0.16 -0.13 – 0.46 0.278
Register [Instructional] -0.24 -0.44 – -0.04 0.021
Register [Personal] 0.47 0.03 – 0.91 0.036
Level [B] -0.78 -0.97 – -0.59 <0.001
Level [C] -1.15 -1.33 – -0.96 <0.001
Level [D] -1.42 -1.62 – -1.23 <0.001
Level [E] -1.87 -2.12 – -1.61 <0.001
Register [Fiction] *
Level [B]
0.86 0.51 – 1.21 <0.001
Register [Informative] *
Level [B]
0.26 -0.10 – 0.62 0.160
Register [Instructional]
* Level [B]
0.24 -0.02 – 0.51 0.069
Register [Personal] *
Level [B]
-0.10 -0.64 – 0.43 0.702
Register [Fiction] *
Level [C]
0.83 0.47 – 1.19 <0.001
Register [Informative] *
Level [C]
0.27 -0.08 – 0.61 0.136
Register [Instructional]
* Level [C]
0.37 0.11 – 0.63 0.005
Register [Personal] *
Level [C]
-0.25 -0.80 – 0.30 0.371
Register [Fiction] *
Level [D]
0.64 0.29 – 1.00 <0.001
Register [Informative] *
Level [D]
0.35 0.00 – 0.70 0.047
Register [Instructional]
* Level [D]
0.47 0.20 – 0.74 0.001
Register [Personal] *
Level [D]
-0.39 -0.98 – 0.19 0.190
Register [Fiction] *
Level [E]
0.83 0.43 – 1.24 <0.001
Register [Informative] *
Level [E]
0.47 0.06 – 0.88 0.024
Register [Instructional]
* Level [E]
0.48 0.15 – 0.82 0.004
Register [Personal] *
Level [E]
-0.69 -1.38 – -0.00 0.050
Random Effects
σ2 0.54
τ00 Series 0.07
ICC 0.11
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.401 / 0.469
tab_model(md4Register) # Marginal R2 = 0.203
  PC 4
Predictors Estimates CI p
(Intercept) -0.24 -0.39 – -0.08 0.002
Register [Fiction] 1.33 1.20 – 1.45 <0.001
Register [Informative] 0.30 0.18 – 0.41 <0.001
Register [Instructional] 0.05 -0.05 – 0.15 0.355
Register [Personal] 0.24 0.04 – 0.44 0.016
Random Effects
σ2 0.75
τ00 Series 0.04
ICC 0.05
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.203 / 0.244
tab_model(md4Level) # Marginal R2 = 0.171
  PC 4
Predictors Estimates CI p
(Intercept) 0.81 0.59 – 1.04 <0.001
Level [B] -0.51 -0.64 – -0.38 <0.001
Level [C] -0.86 -0.99 – -0.73 <0.001
Level [D] -1.07 -1.20 – -0.94 <0.001
Level [E] -1.41 -1.57 – -1.26 <0.001
Random Effects
σ2 0.77
τ00 Series 0.09
ICC 0.11
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.171 / 0.259
# Plot of fixed effects:
plot_model(md4, 
           type = "est",
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = palette[c(1:3,8,7)],
           group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5), 
           title = "",
           wrap.labels = 40,
           axis.title = "PC4 estimated coefficients") +
  theme_sjplot2() 
#ggsave(here("Plots", "TxB_PCA4_lmer_fixedeffects.svg"), height = 8, width = 8)

#svg(here("Plots", "TxB_predicted_PC4_scores_interactions.svg"), height = 5, width = 8)
visreg(md4, xvar = "Level", by="Register", type = "conditional",
       line=list(col="darkred"), 
       xlab = "Textbook Level", ylab = "PC4"
       #,gg = TRUE
       ,layout=c(5,1)
)
dev.off()
## null device 
##           1
# Plot of random effects:
plot_model(md4, 
           type = "re", # Option to visualise random effects
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           color = "bw",
           wrap.labels = 40,
           axis.title = "PC4 estimated coefficients") +
  theme_sjplot2()

#ggsave(here("Plots", "TxB_PCA4_lmer_randomeffects.svg"), height = 3, width = 8)

# Dimension 5
md5 <- lmer(PC5 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md5Register <- lmer(PC5 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md5Level <- lmer(PC5 ~ Level + (1|Series), data = res.ind, REML = FALSE)

anova(md5, md5Register, md5Level)
## Data: res.ind
## Models:
## md5Register: PC5 ~ Register + (1 | Series)
## md5Level: PC5 ~ Level + (1 | Series)
## md5: PC5 ~ Register * Level + (1 | Series)
##             npar    AIC    BIC  logLik deviance   Chisq Df Pr(>Chisq)  
## md5Register    7 4521.9 4561.0 -2253.9   4507.9                        
## md5Level       7 4390.9 4429.9 -2188.4   4376.9 131.035  0             
## md5           27 4393.5 4544.2 -2169.8   4339.5  37.304 20    0.01076 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md5) # Marginal R2 = 0.068
  PC 5
Predictors Estimates CI p
(Intercept) -0.36 -0.65 – -0.08 0.012
Register [Fiction] 0.04 -0.23 – 0.32 0.758
Register [Informative] -0.31 -0.60 – -0.01 0.040
Register [Instructional] 0.14 -0.06 – 0.34 0.182
Register [Personal] 0.14 -0.30 – 0.57 0.533
Level [B] 0.30 0.11 – 0.48 0.002
Level [C] 0.54 0.36 – 0.73 <0.001
Level [D] 0.62 0.43 – 0.81 <0.001
Level [E] 0.42 0.17 – 0.68 0.001
Register [Fiction] *
Level [B]
0.01 -0.33 – 0.36 0.953
Register [Informative] *
Level [B]
0.13 -0.22 – 0.49 0.470
Register [Instructional]
* Level [B]
-0.01 -0.27 – 0.25 0.921
Register [Personal] *
Level [B]
0.00 -0.52 – 0.53 0.987
Register [Fiction] *
Level [C]
0.01 -0.34 – 0.36 0.958
Register [Informative] *
Level [C]
0.31 -0.04 – 0.65 0.080
Register [Instructional]
* Level [C]
-0.24 -0.50 – 0.02 0.065
Register [Personal] *
Level [C]
-0.15 -0.69 – 0.39 0.588
Register [Fiction] *
Level [D]
-0.13 -0.48 – 0.22 0.478
Register [Informative] *
Level [D]
0.28 -0.06 – 0.63 0.107
Register [Instructional]
* Level [D]
-0.17 -0.43 – 0.10 0.210
Register [Personal] *
Level [D]
0.05 -0.53 – 0.62 0.878
Register [Fiction] *
Level [E]
-0.01 -0.41 – 0.39 0.951
Register [Informative] *
Level [E]
0.59 0.19 – 0.99 0.004
Register [Instructional]
* Level [E]
0.21 -0.11 – 0.54 0.199
Register [Personal] *
Level [E]
-0.40 -1.08 – 0.28 0.247
Random Effects
σ2 0.53
τ00 Series 0.14
ICC 0.21
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.068 / 0.264
tab_model(md5Register) # Marginal R2 = 0.001
  PC 5
Predictors Estimates CI p
(Intercept) 0.04 -0.22 – 0.30 0.776
Register [Fiction] 0.02 -0.09 – 0.13 0.682
Register [Informative] 0.03 -0.08 – 0.13 0.611
Register [Instructional] 0.06 -0.02 – 0.15 0.157
Register [Personal] 0.06 -0.12 – 0.23 0.514
Random Effects
σ2 0.57
τ00 Series 0.15
ICC 0.20
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.001 / 0.205
tab_model(md5Level) # Marginal R2 = 0.054
  PC 5
Predictors Estimates CI p
(Intercept) -0.34 -0.60 – -0.08 0.011
Level [B] 0.30 0.19 – 0.41 <0.001
Level [C] 0.49 0.38 – 0.60 <0.001
Level [D] 0.57 0.47 – 0.68 <0.001
Level [E] 0.57 0.44 – 0.70 <0.001
Random Effects
σ2 0.54
τ00 Series 0.14
ICC 0.21
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.054 / 0.250
# Plot of fixed effects:
plot_model(md5, 
           type = "est",
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = palette[c(1:3,8,7)],
           group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5), 
           title = "",
           wrap.labels = 40,
           axis.title = "PC5 estimated coefficients") +
  theme_sjplot2() 

#ggsave(here("Plots", "TxB_PCA5_lmer_fixedeffects.svg"), height = 3, width = 8)

# Plot of random effects:
plot_model(md5, 
           type = "re", # Option to visualise random effects
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           color = "bw",
           wrap.labels = 40,
           axis.title = "PC5 estimated coefficients") +
  theme_sjplot2()

#ggsave(here("Plots", "TxB_PCA5_lmer_randomeffects.svg"), height = 3, width = 8)

# Dimension 6
md6 <- lmer(PC6 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md6Register <- lmer(PC6 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md6Level <- lmer(PC6 ~ Level + (1|Series), data = res.ind, REML = FALSE)

anova(md6, md6Register, md6Level)
## Data: res.ind
## Models:
## md6Register: PC6 ~ Register + (1 | Series)
## md6Level: PC6 ~ Level + (1 | Series)
## md6: PC6 ~ Register * Level + (1 | Series)
##             npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## md6Register    7 4141.3 4180.4 -2063.7   4127.3                         
## md6Level       7 4154.4 4193.5 -2070.2   4140.4  0.000  0               
## md6           27 4102.2 4252.9 -2024.1   4048.2 92.181 20  3.074e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md6) # Marginal R2 = 0.047
  PC 6
Predictors Estimates CI p
(Intercept) 0.12 -0.15 – 0.40 0.373
Register [Fiction] -0.26 -0.52 – -0.00 0.046
Register [Informative] -0.40 -0.67 – -0.13 0.004
Register [Instructional] -0.36 -0.55 – -0.17 <0.001
Register [Personal] -0.06 -0.47 – 0.34 0.770
Level [B] 0.14 -0.04 – 0.31 0.121
Level [C] 0.01 -0.16 – 0.18 0.916
Level [D] 0.00 -0.18 – 0.18 0.988
Level [E] 0.28 0.04 – 0.52 0.021
Register [Fiction] *
Level [B]
0.36 0.04 – 0.68 0.027
Register [Informative] *
Level [B]
0.28 -0.05 – 0.61 0.098
Register [Instructional]
* Level [B]
-0.04 -0.28 – 0.20 0.722
Register [Personal] *
Level [B]
-0.33 -0.82 – 0.16 0.184
Register [Fiction] *
Level [C]
0.20 -0.13 – 0.52 0.238
Register [Informative] *
Level [C]
0.21 -0.11 – 0.53 0.194
Register [Instructional]
* Level [C]
0.25 0.01 – 0.49 0.039
Register [Personal] *
Level [C]
-0.06 -0.56 – 0.45 0.825
Register [Fiction] *
Level [D]
0.37 0.04 – 0.69 0.027
Register [Informative] *
Level [D]
0.38 0.06 – 0.70 0.019
Register [Instructional]
* Level [D]
0.33 0.09 – 0.58 0.008
Register [Personal] *
Level [D]
0.05 -0.48 – 0.59 0.844
Register [Fiction] *
Level [E]
-0.09 -0.45 – 0.28 0.652
Register [Informative] *
Level [E]
0.49 0.12 – 0.86 0.010
Register [Instructional]
* Level [E]
-0.06 -0.37 – 0.24 0.679
Register [Personal] *
Level [E]
-0.58 -1.21 – 0.05 0.070
Random Effects
σ2 0.45
τ00 Series 0.13
ICC 0.23
N Series 9
Observations 1961
Marginal R2 / Conditional R2 0.047 / 0.264
# Plot of fixed effects:
plot_model(md6, 
           type = "est",
           show.intercept = TRUE,
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           colors = palette[c(1:3,8,7)],
           group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5), 
           title = "",
           wrap.labels = 40,
           axis.title = "PC6 estimated coefficients") +
  theme_sjplot2() 

#ggsave(here("Plots", "TxB_PCA6_lmer_fixedeffects.svg"), height = 3, width = 8)

# Plot of random effects:
plot_model(md6, 
           type = "re", # Option to visualise random effects
           show.values=TRUE, 
           show.p=TRUE,
           value.offset = .4,
           value.size = 3.5,
           color = "bw",
           wrap.labels = 40,
           axis.title = "PC6 estimated coefficients") +
  theme_sjplot2()

#ggsave(here("Plots", "TxB_PCA6_lmer_randomeffects.svg"), height = 3, width = 8)

# Boxplot visualisation

res.ind %>% 
  ggplot(aes(x = Register, y = PC2, fill = Level, facet = Register)) +
         geom_boxplot() +
  facet_grid()

ggplot(res.ind, aes(x=Register,y=PC2, fill = Level, colour = Register, facet = Register))+ # Or leave out "colour = Register" to keep the dots in black
  #geom_point(position = position_jitter(width = .15), size = .25)+
# note that here we need to set the x-variable to a numeric variable and bump it to get the boxplots to line up with the rainclouds. 
  geom_boxplot(aes(x = as.numeric(Register), y = PC2), colour = "BLACK") +
  ylab('PC2')+ 
  theme_minimal()+ 
  guides(fill = FALSE, colour = FALSE) +
  scale_colour_manual(values = colours)+
  scale_fill_manual(values = colours)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#ggsave(here("Plots", "TxB_PC2_RegisterLevelsBoxplots.svg"), width = 13, height = 8)